import numpy as np #linear algebra
import pandas as pd # data processing,CSV file I/O(e.g pd.read_csv)
import seaborn as sns # for statistical data visualization
import matplotlib.pyplot as mtp # for data visualization
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
# Load the affairs dataset from a local CSV file and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# only run where this exact file exists.
csv_path = r"C:\Users\laxma\Downloads\affairs.csv"
data = pd.read_csv(csv_path)
data.head()
| Unnamed: 0 | rate_marriage | age | yrs_married | children | religious | educ | occupation | occupation_husb | affairs | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3.0 | 32.0 | 9.0 | 3.0 | 3.0 | 17.0 | 2.0 | 5.0 | 0.111111 |
| 1 | 1 | 3.0 | 27.0 | 13.0 | 3.0 | 1.0 | 14.0 | 3.0 | 4.0 | 3.230769 |
| 2 | 2 | 4.0 | 22.0 | 2.5 | 0.0 | 1.0 | 16.0 | 3.0 | 5.0 | 1.400000 |
| 3 | 3 | 4.0 | 37.0 | 16.5 | 4.0 | 3.0 | 16.0 | 5.0 | 5.0 | 0.727273 |
| 4 | 4 | 5.0 | 27.0 | 9.0 | 1.0 | 1.0 | 14.0 | 3.0 | 4.0 | 4.666666 |
# Preview the last rows of the dataset.
data.tail()
| Unnamed: 0 | rate_marriage | age | yrs_married | children | religious | educ | occupation | occupation_husb | affairs | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6361 | 6361 | 5.0 | 32.0 | 13.0 | 2.0 | 3.0 | 17.0 | 4.0 | 3.0 | 0.0 |
| 6362 | 6362 | 4.0 | 32.0 | 13.0 | 1.0 | 1.0 | 16.0 | 5.0 | 5.0 | 0.0 |
| 6363 | 6363 | 5.0 | 22.0 | 2.5 | 0.0 | 2.0 | 14.0 | 3.0 | 1.0 | 0.0 |
| 6364 | 6364 | 5.0 | 32.0 | 6.0 | 1.0 | 3.0 | 14.0 | 3.0 | 4.0 | 0.0 |
| 6365 | 6365 | 4.0 | 22.0 | 2.5 | 0.0 | 2.0 | 16.0 | 2.0 | 4.0 | 0.0 |
# Count the rows flagged as duplicates of another row.
data.duplicated().sum()
0
# List the column labels of the loaded DataFrame.
data.columns
Index(['Unnamed: 0', 'rate_marriage', 'age', 'yrs_married', 'children',
'religious', 'educ', 'occupation', 'occupation_husb', 'affairs'],
dtype='object')
# VISUALIZATION
# Bar chart: marriage rating on the x-axis, age as the bar height.
plt.bar(x=data['rate_marriage'], height=data['age'])
plt.xticks(rotation=90)  # draw the tick labels vertically
plt.show()
# Interactive Plotly bar chart of age against years married, with the bars
# coloured by years married.
fig = px.bar(
    data_frame=data,
    x='yrs_married',
    y='age',
    color='yrs_married',
)
fig.show()
# Red scatter of marriage rating (y) against number of children (x).
plt.scatter(x=data['children'], y=data['rate_marriage'], color='red')
plt.xticks(rotation=90)  # draw the tick labels vertically
plt.show()
# Frequency of each `educ` value, drawn in blue on a 10x4 figure.
plt.figure(figsize=(10, 4))
sns.countplot(data=data, x='educ', color='b')
plt.show()
# NOTE(review): `top_car` (the ten most frequent `children` values) is not used
# by any code visible here; it looks like a leftover and can probably be
# dropped once nothing else is confirmed to rely on it.
top_car = data['children'].value_counts().nlargest(10)

# Horizontal count plot of the `children` column on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.countplot(y=data['children'], color='cyan')
<AxesSubplot:xlabel='count', ylabel='children'>
# Line plot of the 'Unnamed: 0' column against `religious`.
# NOTE(review): 'Unnamed: 0' looks like a row counter saved with the CSV
# (it mirrors the index in the head/tail previews) - confirm that this is
# really the quantity meant for the y-axis.
sns.lineplot(data=data, x='religious', y='Unnamed: 0')
<AxesSubplot:xlabel='religious', ylabel='Unnamed: 0'>
# Bar plot of age per occupation category, drawn in red.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` as a
# positional argument, so the earlier positional call emits a FutureWarning
# on 0.11 and fails outright on newer releases.
sns.barplot(x='occupation', y='age', data=data, color='r')
plt.xticks(rotation=90)  # draw the tick labels vertically
plt.show()
D:\anaconda files\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
# Scatter of `educ` against `yrs_married` on an 8x4 figure, with explicit
# axis labels.
plt.figure(figsize=(8, 4))
sns.scatterplot(x='yrs_married', y='educ', data=data)
plt.ylabel('educ')
plt.xlabel('yrs_married')
plt.show()

# Distribution plot of the `occupation_husb` column.
sns.displot(data["occupation_husb"])
<seaborn.axisgrid.FacetGrid at 0x280c52710a0>
# Relational (scatter) plot of occupation against marriage rating.
sns.relplot(data=data, x='rate_marriage', y='occupation')
<seaborn.axisgrid.FacetGrid at 0x280c5030640>
# Frequency of each `occupation` value, with vertical tick labels.
sns.countplot(data=data, x='occupation')
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5]), [Text(0, 0, '1.0'), Text(1, 0, '2.0'), Text(2, 0, '3.0'), Text(3, 0, '4.0'), Text(4, 0, '5.0'), Text(5, 0, '6.0')])
# Box plot of the `affairs` values within each marriage rating.
sns.boxplot(data=data, x='rate_marriage', y='affairs')
<AxesSubplot:xlabel='rate_marriage', ylabel='affairs'>
# Violin plot of `occupation_husb` within each `occupation` value.
sns.violinplot(data=data, x='occupation', y='occupation_husb')
<AxesSubplot:xlabel='occupation', ylabel='occupation_husb'>
# MODEL BUILDING
# Features used for clustering: the `children` and `religious` columns.
# They are selected by name (they sit at positions 4 and 5 of data.columns)
# so the selection stays correct if the CSV column order ever changes.
x = data[['children', 'religious']].values
import scipy.cluster.hierarchy as shc

# Build the Ward-linkage hierarchy over the two features and draw it as a
# dendrogram.
dendro = shc.dendrogram(shc.linkage(x, method='ward'))
mtp.title('Dendrogram Plot')
mtp.ylabel('Euclidean Distance')
# Each leaf is one row of the dataset; the previous 'Customer' label did not
# describe this data.
mtp.xlabel('Observation')
mtp.show()
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering of `x` into 5 groups.
# The distance keyword is deliberately omitted: Ward linkage only supports
# Euclidean distance (the default), and the old `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 in favour of `metric`.
# Leaving it out gives the same result on old and new releases alike.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_pred = hc.fit_predict(x)  # one cluster label (0-4) per row of x
y_pred
array([0, 2, 3, ..., 3, 1, 3], dtype=int64)
# Scatter the two clustering features, one colour per predicted cluster.
# Colours are listed in cluster-label order (label 0 -> 'Cluster 1', ...).
cluster_colors = ['blue', 'red', 'cyan', 'black', 'orange']
for cluster_id, color in enumerate(cluster_colors):
    members = x[y_pred == cluster_id]  # rows assigned to this cluster
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')

# x holds the `children` (column 0) and `religious` (column 1) features, so
# the title and both axis labels name those columns rather than years married.
mtp.title('Clusters of children vs. religious')
mtp.xlabel('Children')
mtp.ylabel('Religious')
mtp.legend()
mtp.show()